# 加载所需包和表达量数据
library(tidyverse)
fpkm <- read_tsv("data/fpkm.txt", col_names = T) # 默认有表头,csv文件可用read_csv来加载
fpkm #> # A tibble: 298 x 6
#> Species Gene AM AF LM LF
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Bmor CSP3 814. 1006. 199. 16.7
#> 2 Bmor CSP1 4410. 5338. 240. 189.
#> 3 Bmor CSP2 6253. 7568. 141. 71.8
#> 4 Bmor CSP5 72.8 78.5 16.5 11.3
#> 5 Bmor CSP4 71.6 79.5 64.8 67.1
#> 6 Bmor GR3 1914. 2058. NA NA
#> 7 Bmor GR10 348. 399. 3.34 4.07
#> 8 Bmor GR1 244. 201. NA NA
#> 9 Bmor GR12 154. 145. NA NA
#> 10 Bmor GR15 126. 125. 24.5 18.8
#> # … with 288 more rows
注:注意观察第一行所显示数据格式 tibble
# 文件写出
write_tsv(fpkm, "data/fpkm_v2.txt", col_names = T) # txt文件输出
write_csv(fpkm, "data/fpkm_v2.csv", col_names = T) # csv文件输出
# 将LF列空值删除
fpkm %>% filter(!is.na(LF))
# 注意此处用到了几个语法:
# 1. %>%为管道符,即下一步;
# 2. is.na()用来判断是否为空值,!is.na()当然为非空, !is.na(LF)为判断LF列非空;
# 3. filter()为按条件过滤,满足条件的留下,不满足条件的删除;
# 4. 管道符的使用不会更改初始fpkm的变量
# 5. fpkm_filter <- fpkm %>% filter(!is.na(LF)) 可将结果导入新变量,R推荐"<-" 替代等号;
# 空值赋值为0
fpkm[is.na(fpkm)] <- 0 # fpkm中的NA值已被替换为0
fpkm
#library(ggplot2)
fpkm[is.na(fpkm)] <- 0 # 空值替换为0
dat <- fpkm %>% gather(`AM`, `AF`, `LM`, `LF`, key = "Group", value = "FPKM") %>%
mutate(Name = paste0(Species, "_", Group)) %>% filter(str_detect(Gene, "GR"))
# 注意此处用到了几个语法:
# 1. %>% 管道符;
# 2. gather() 将AM AF LM LF四列数据合并为两列,即Group和FPKM;
# 3. mutate() 增加新变量;
# 4. paste0 将两个变量连起来;
# 5. filter() 条件过滤;
ggplot(data = dat, mapping = aes(x = Gene, y = log10(FPKM + 1), fill = Name)) +
geom_bar(stat="identity") + labs(x="") +
theme(axis.text.x=element_text(size = 10, angle= -45, hjust = -0.001, vjust = 1))
Fig. GR柱状图
library(pheatmap)
union <- fpkm %>% mutate(Gene_name = paste0(Species, Gene)) %>%
filter(str_detect(Gene, "OBP")) %>%
select(Gene_name, AM, AF, LM, LF)
# 注意此处用到了几个语法:
# 1. %>% 管道符;
# 2. mutate() 增加新变量;
# 3. paste0 将两个变量连起来;
# 4. filter() 条件过滤;
# 5. str_detect 字符匹配,能够从Gene中匹配到OBP为TRUE, filter()会保留TRUE的行;
# 6. select() 选择想要保留的列;
union <- as.data.frame(union) # tibble转换为data.frame格式
rownames(union) <- union[,1] # 设置首列为行名称
union <- union[,-1] # 只保留4列数据
union <- log10(union + 1) # 表达量数据差异大,应取log值
# 画图
pheatmap(union, color=colorRampPalette(rev(c("red", "linen")))(10),
legend = T, show_rownames = TRUE,
fontsize_row = 4, cellwidth = 55,
cluster_rows = F, cluster_cols = F, border_color = NA)
Fig. OBP热图
library(pheatmap)
fpkm[is.na(fpkm)] <- 0 # 空值替换为0
union <- fpkm %>% mutate(Gene_name = paste0(Species, Gene)) %>%
filter(str_detect(Gene, "OBP") & Species == "Bmor") %>%
select(Gene_name, AM, AF, LM, LF)
# 注意此处用到了几个语法:
# 1. filter() 增加2个条件同时满足;
union <- as.data.frame(union) # tibble转换为data.frame格式
rownames(union)<-union[,1] # 设置首列为行名称
union<-union[,-1] # 只保留4列数据
union<-log10(union + 1) # 表达量数据差异大,应取log值
# 画图
pheatmap(union, color=colorRampPalette(rev(c("red", "linen")))(10),
legend=T, show_rownames = TRUE,
fontsize_row = 4, cellwidth = 55,
cluster_rows = F, cluster_cols = F, border_color = NA)
Fig. BmorOBP热图
#> # A tibble: 234 x 11
#> manufacturer model displ year cyl trans drv cty hwy fl class
#> <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
#> 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
#> 3 audi a4 2 2008 4 manu… f 20 31 p comp…
#> 4 audi a4 2 2008 4 auto… f 21 30 p comp…
#> 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
#> 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
#> 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
#> 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp…
#> 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp…
#> 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp…
#> # … with 224 more rows
# 分面作图使用 facet_wrap()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_wrap(~ class, nrow = 2)
#> # A tibble: 336,776 x 19
#> year month day dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013 1 1 517 515 2 830
#> 2 2013 1 1 533 529 4 850
#> 3 2013 1 1 542 540 2 923
#> 4 2013 1 1 544 545 -1 1004
#> 5 2013 1 1 554 600 -6 812
#> 6 2013 1 1 554 558 -4 740
#> 7 2013 1 1 555 600 -5 913
#> 8 2013 1 1 557 600 -3 709
#> 9 2013 1 1 557 600 -3 838
#> 10 2013 1 1 558 600 -2 753
#> # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>
#> # A tibble: 842 x 19
#> year month day dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013 1 1 517 515 2 830
#> 2 2013 1 1 533 529 4 850
#> 3 2013 1 1 542 540 2 923
#> 4 2013 1 1 544 545 -1 1004
#> 5 2013 1 1 554 600 -6 812
#> 6 2013 1 1 554 558 -4 740
#> 7 2013 1 1 555 600 -5 913
#> 8 2013 1 1 557 600 -3 709
#> 9 2013 1 1 557 600 -3 838
#> 10 2013 1 1 558 600 -2 753
#> # … with 832 more rows, and 12 more variables: sched_arr_time <int>,
#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>
#> # A tibble: 55,403 x 19
#> year month day dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013 11 1 5 2359 6 352
#> 2 2013 11 1 35 2250 105 123
#> 3 2013 11 1 455 500 -5 641
#> 4 2013 11 1 539 545 -6 856
#> 5 2013 11 1 542 545 -3 831
#> 6 2013 11 1 549 600 -11 912
#> 7 2013 11 1 550 600 -10 705
#> 8 2013 11 1 554 600 -6 659
#> 9 2013 11 1 554 600 -6 826
#> 10 2013 11 1 554 600 -6 749
#> # … with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>
flights_select <- flights %>% select(year:day, ends_with("delay"), distance, air_time)
# 从 flights中选择 从year到day列,以delay结尾的列,及distance、air_time列
# 并将所选内容赋值新变量 flights_select
flights_select
#> # A tibble: 336,776 x 7
#> year month day dep_delay arr_delay distance air_time
#> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 2013 1 1 2 11 1400 227
#> 2 2013 1 1 4 20 1416 227
#> 3 2013 1 1 2 33 1089 160
#> 4 2013 1 1 -1 -18 1576 183
#> 5 2013 1 1 -6 -25 762 116
#> 6 2013 1 1 -4 12 719 150
#> 7 2013 1 1 -5 19 1065 158
#> 8 2013 1 1 -3 -14 229 53
#> 9 2013 1 1 -3 -8 944 140
#> 10 2013 1 1 -2 8 733 138
#> # … with 336,766 more rows
#> # A tibble: 336,776 x 9
#> year month day dep_delay arr_delay distance air_time gain speed
#> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2013 1 1 2 11 1400 227 9 370.
#> 2 2013 1 1 4 20 1416 227 16 374.
#> 3 2013 1 1 2 33 1089 160 31 408.
#> 4 2013 1 1 -1 -18 1576 183 -17 517.
#> 5 2013 1 1 -6 -25 762 116 -19 394.
#> 6 2013 1 1 -4 12 719 150 16 288.
#> 7 2013 1 1 -5 19 1065 158 24 404.
#> 8 2013 1 1 -3 -14 229 53 -11 259.
#> 9 2013 1 1 -3 -8 944 140 -5 405.
#> 10 2013 1 1 -2 8 733 138 10 319.
#> # … with 336,766 more rows
# 只保留新变量使用 transmute()
flights_select %>% transmute(
gain = arr_delay - dep_delay,
speed = distance / air_time * 60)
#> # A tibble: 336,776 x 2
#> gain speed
#> <dbl> <dbl>
#> 1 9 370.
#> 2 16 374.
#> 3 31 408.
#> 4 -17 517.
#> 5 -19 394.
#> 6 16 288.
#> 7 24 404.
#> 8 -11 259.
#> 9 -5 405.
#> 10 10 319.
#> # … with 336,766 more rows
#> # A tibble: 365 x 4
#> # Groups: year, month [?]
#> year month day mean
#> <int> <int> <int> <dbl>
#> 1 2013 1 1 11.5
#> 2 2013 1 2 13.9
#> 3 2013 1 3 11.0
#> 4 2013 1 4 8.95
#> 5 2013 1 5 5.73
#> 6 2013 1 6 7.15
#> 7 2013 1 7 5.42
#> 8 2013 1 8 2.55
#> 9 2013 1 9 2.28
#> 10 2013 1 10 2.84
#> # … with 355 more rows
| Command | Notes | 用法 |
|---|---|---|
| cd | 切换工作路径 | cd 目录名称 |
| mkdir | 新建工作目录 | mkdir [参数] 目录名称 |
| less | 查看文件 | less file |
| head | 显示前10行 | head [参数] file |
| tail | 显示末尾10行 | tail [参数] file |
| wc | 统计 | wc [参数] file |
| grep | 提取 | grep [参数] file |
| pwd | 当前工作目录 | pwd |
| top | 动态监视进程 | top |
| kill | 杀死进程 | kill[参数] [进程PID] |
| wget | 下载 | wget [参数] 下载地址 |
| … | … | … |
#> 298 /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv
# 将OBP替换为obp
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv | grep 'OBP'| \
sed 's/OBP/obp/'| head
# 注意此处用到的语句:
# 1. less 查看文件
# 2. | 为linux中的管道符,使用频率非常高
# 3. grep 为提取能够匹配的行
# 4. sed 替换
# 5. head 只显示前十行;对应的tail只显示末尾10行
#> Bmor,obp17,237624.7484,279357.8935,120852.7772,91312.67478
#> Bmor,obp19,144133.6656,157549.734,756.9491698,515.0887552
#> Bmor,obp18,81361.47057,100905.594,199.3020776,106.6935315
#> Bmor,obp32,42469.90418,50627.90657,8004.179801,7513.335457
#> Bmor,obp25,104198.5474,116736.2514,301.05281,223.1686628
#> Bmor,obp22,17423.89138,22891.70397,33701.86007,29457.46544
#> Bmor,obp10,37114.76776,43113.01742,87.91817399,38.59747501
#> Bmor,obp34,15082.7183,14119.80703,28.72357294,20.76104541
#> Bmor,obp9,6898.002276,8802.937048,14.72778576,9.906821138
#> Bmor,obp26,30443.1707,32829.36014,0.655371554,0.634759894
# 在行前加数字
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv | \
awk -F '\,' 'BEGIN{n=1}{print n"\t"$1"\t"$2"\t"$(NF-1)"\t"$NF; n++}' | \
head
# 注意此处用到的语句:
# 1. -F ',' 指以逗号分割
# 2. $1为第一列,$NF为最后一列, $(NF-1)为倒数第二列
# 3. head 只打印出前10行,避免打印内容太多造成刷屏
#> 1 Species Gene LM LF
#> 2 Bmor CSP3 199.30776 16.6935315
#> 3 Bmor CSP1 239.80977 188.73617
#> 4 Bmor CSP2 140.6124347 71.78603079
#> 5 Bmor CSP5 16.49506383 11.3499129
#> 6 Bmor CSP4 64.75705246 67.10626644
#> 7 Bmor GR3
#> 8 Bmor GR10 3.337080858 4.074575804
#> 9 Bmor GR1
#> 10 Bmor GR12
# 累加求和
# 以求OBP LM FPKM加和为例
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv | \
grep 'OBP' | \
awk -F ',' '{sum=sum+$3}END{print "OBP_LM FPKM\t"sum}'
#> OBP_LM FPKM 3.04498e+06
# fastq2fasta
less /Users/liang/Desktop/github/Bioinformatics_learn/data/test.fastq | \
awk '{if(NR%4 == 1){print ">" substr($0, 2)}}{if(NR%4 == 2){print}}' | \
head -4
#> >E00495:452:HN2THCCXY:7:1101:11119:1520 1:N:0:GACTAGTA
#> CAGCAACAGTGCTATTAAAAGTCCTACCACGCCAAACCAATTATGTGTACGACCATTGCTTCTCGTATCCATCATATATTAGTTATTTGTAAATATATCATTTTAAATATGCAAAGTTACTGATTTTTATAATGATTCGAGCAGATCGGA
#> >E00495:452:HN2THCCXY:7:1101:16478:1520 1:N:0:GACTAGTA
#> CCCTGCAAGGCGCCGTATTCCATCTGCCACCGGGAGACATCATCTGCCTGACCGGTTCCGTATTATAAGCCCTTTAGATCTCACTGAACGCCCGCCTGGCCCGGCAACACGACCCAGTGCTGCAGGGAATCTTGCAAATGGGCTTACTCA
vim是linux编辑器,需要自行上网查询,使用频率非常高,掌握后对于编辑文件非常方便;
另 微软VScode 可实现remote-ssh登陆服务器,可实现服务端文件编辑,非常方便,极力推荐!!!
以上运行linux命令均可写为shell bash脚本,直接运行bash *.sh即可,方便运行,还可整合为流程脚本,非常方便!!!
# 仍以fastq2fasta为例,将linux命令写入bash脚本
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fastq2fasta.sh
#> #/bin/bash
#>
#> less /Users/liang/Desktop/github/Bioinformatics_learn/data/test.fastq | \
#> awk '{if(NR%4 == 1){print ">" substr($0, 2)}}{if(NR%4 == 2){print}}'
# 仍以fastq2fasta为例, 运行脚本
bash /Users/liang/Desktop/github/Bioinformatics_learn/data/fastq2fasta.sh | head -4
#> >E00495:452:HN2THCCXY:7:1101:11119:1520 1:N:0:GACTAGTA
#> CAGCAACAGTGCTATTAAAAGTCCTACCACGCCAAACCAATTATGTGTACGACCATTGCTTCTCGTATCCATCATATATTAGTTATTTGTAAATATATCATTTTAAATATGCAAAGTTACTGATTTTTATAATGATTCGAGCAGATCGGA
#> >E00495:452:HN2THCCXY:7:1101:16478:1520 1:N:0:GACTAGTA
#> CCCTGCAAGGCGCCGTATTCCATCTGCCACCGGGAGACATCATCTGCCTGACCGGTTCCGTATTATAAGCCCTTTAGATCTCACTGAACGCCCGCCTGGCCCGGCAACACGACCCAGTGCTGCAGGGAATCTTGCAAATGGGCTTACTCA
# 整理流程需要传参
# 将fastq文件作为参数传输
bash /Users/liang/Desktop/github/Bioinformatics_learn/data/fastq2fasta_argv.sh \
/Users/liang/Desktop/github/Bioinformatics_learn/data/test.fastq | \
head -4
#> >E00495:452:HN2THCCXY:7:1101:11119:1520 1:N:0:GACTAGTA
#> CAGCAACAGTGCTATTAAAAGTCCTACCACGCCAAACCAATTATGTGTACGACCATTGCTTCTCGTATCCATCATATATTAGTTATTTGTAAATATATCATTTTAAATATGCAAAGTTACTGATTTTTATAATGATTCGAGCAGATCGGA
#> >E00495:452:HN2THCCXY:7:1101:16478:1520 1:N:0:GACTAGTA
#> CCCTGCAAGGCGCCGTATTCCATCTGCCACCGGGAGACATCATCTGCCTGACCGGTTCCGTATTATAAGCCCTTTAGATCTCACTGAACGCCCGCCTGGCCCGGCAACACGACCCAGTGCTGCAGGGAATCTTGCAAATGGGCTTACTCA
| 类别 | 软件 | 软件说明 | 备注 |
|---|---|---|---|
| 质控 | fastqc | 下机数据质控 | github / 官网 / 软件安装及使用 |
| 质控 | Trimmomatic | 下机数据质控 | github / 软件安装及使用 / paper |
| 质控 | SolexaQA | 下机数据质控 | 软件安装及使用 / 官网 paper |
| SRA解压 | fastq-dump | 解压NCBI SRA格式为fastq格式 | 软件安装及使用 |
| 比对 | blast | NCBI 比对软件 | 软件安装及使用 / 算法 |
| 比对 | bwa | 短序列比对软件 | 软件安装及使用 / 算法 |
| 比对 | bowtie2 | 短序列比对软件 | 软件安装及使用 / 算法 |
| 比对 | kallisto | 基于kmer比对软件, 速度极快 | 软件安装及使用 / paper |
| 比对 | STAR | RNA-seq序列比对工具 | 软件安装及使用 / github |
| sam处理 | samtools | 操作sam和bam文件的工具合集 | 软件安装及使用 / github |
| 组装 | SOAPdenovo | 华大出品组装软件 | 软件使用 |
| 组装 | SPAdes | 基因组组装软件 | 软件安装及使用 |
| 组装 | IDBA | 基因组组装软件 | 软件安装及使用 |
| 转录组组装 | Trinity | 转录组无参组装 | 软件安装及使用 |
| 有参转录组 | TopHat | 有参转录组比对软件 | 软件安装及使用 |
| 有参转录组 | Cufflinks | 用于基因表达量的计算和差异表达基因的寻找 | 软件安装及使用 |
| 聚类 | |||
| 构树 |
2014.08 - 2015.12 : 植保所昆功组 - 转录组 - 生信入门 + 自学;
2015.12 - 2017.05 : 诺禾致源 - 基因组 - perl/R/bash;
2017.06 - 2018.12 : 量化健康 - 宏基因组 - perl/R/bash/python/docker/git/ML;
2018.12 - 至今 : 先声诊断 - Nanopore - 数据库/机器学习 + 成长中~;
---
title: "生信入门基础"
author: "Liang XZ"
output:
flexdashboard::flex_dashboard:
html_document: default
vertical_layout: fill
social: menu
source_code: embed
---
```{r setup, include=FALSE}
# Global knitr chunk options. The original called opts_chunk$set() twice with
# disjoint option sets; they are merged into a single call here.
knitr::opts_chunk$set(
  collapse = TRUE, comment = "#>",
  echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"
)
options(kableExtra.latex.load_packages = FALSE)
# Packages used throughout the dashboard.
library(kableExtra)
library(magrittr)
library(flexdashboard)
library(tidyverse)
library(ggplot2)
library(data.table)
# Session info for the DT package (debug aid; output hidden by include=FALSE).
xfun::session_info('DT')
```
Index
=====================================
Inputs {.sidebar}
-------------------------------------
**目录**
* 引言:生信图谱
* 第一章:R语言
* 第二章:Linux基础
* 第三章:生信常用软件及格式
* 第四章:Github介绍
* 第五章:生信流程介绍
Column {.tabset}
-------------------------------------
### 引言
```{r, eval = T, echo = F, size="small", fig.align = 'right', fig.height = 15, fig.width = 14, out.width="100%", cache=FALSE,warning=FALSE}
# Embed the cover image of the tutorial.
cover_png <- "data/cover.png"
knitr::include_graphics(cover_png)
```
第一章
=====================================
Inputs {.sidebar}
-------------------------------------
**第一章 R语言**
* 学会使用tidyverse包
* 掌握tibble数据格式
* 学会数据导入/输出
* 数据基本操作
* ggplot2画图
* Rmarkdown
Column {.tabset}
-------------------------------------
### 1. Rstudio平台
#### 1.1 书籍推荐
* [**R for data science**](http://www.allitebooks.org/r-for-data-science-2/)
* [**Github**](https://github.com/hadley/r4ds)
#### 1.2 R语言安装
* 安装方法参考[网页链接](https://zhuanlan.zhihu.com/p/31161726)
* Windows/Mac : [官网下载](https://mirrors.tuna.tsinghua.edu.cn/CRAN/)
#### 1.3 Rstudio安装
* Windows/Mac: [官网下载](https://www.rstudio.com/products/rstudio/download/#download)
* Rstudio工作界面
```{r, eval = T, echo = F, size="small", fig.align = 'center', fig.height = 3, fig.width = 3, out.width="50%", cache=FALSE,warning=FALSE}
# Embed the screenshot of the RStudio working interface.
rstudio_png <- "data/Rstudio.png"
knitr::include_graphics(rstudio_png)
```
#### 1.4 R常用包安装/加载
* 画图:ggplot2
```{r, eval = F, echo = T}
# Install the package; the quotes around the package name are required here.
install.packages("ggplot2")
# Attach the package for the current session.
library(ggplot2)
# Open the help page to learn how the function is used.
?ggplot
```
* tibble数据格式: tidyverse
```{r, eval = F, echo = T}
# Install the tidyverse meta-package (provides the tibble data format).
install.packages("tidyverse")
# Attach it for the current session.
library(tidyverse)
# Open the help page to learn more.
?tidyverse
```
* 判断安装/加载
```{r, eval = F, echo = T}
# Install the package first if it is not already available, then load it.
# requireNamespace() only checks that the package is installed (it does not
# attach it), which is the recommended test; require() both loads the package
# and returns FALSE on failure, making it easy to misuse for dependency checks.
if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse")
library(tidyverse)
```
### 2. tidyverse
#### 2.0 网上资源
* [初学者学习tidyverse](https://www.jianshu.com/p/f3c21a5ad10a)
* [数据清洗转换](https://blog.csdn.net/ss_fisher/article/details/80919026)
* [tidyverse基础知识汇总](https://www.cnblogs.com/YangCool/p/9944217.html)
#### 2.1 数据导入
```{r, eval = T, echo = T}
# Load the tidyverse and read the expression (FPKM) table.
library(tidyverse)
# read_tsv() assumes a header row by default; use read_csv() for csv files.
# TRUE is spelled out instead of T: T/F are ordinary variables and can be
# reassigned, so the full names are the safe form.
fpkm <- read_tsv("data/fpkm.txt", col_names = TRUE)
fpkm
```
注:注意观察第一行所显示数据格式 tibble
#### 2.2 文件输出
```{r, eval = F, echo = T}
# Write the table back out (TRUE spelled out instead of the abbreviation T).
write_tsv(fpkm, "data/fpkm_v2.txt", col_names = TRUE) # tab-separated .txt
write_csv(fpkm, "data/fpkm_v2.csv", col_names = TRUE) # comma-separated .csv
```
#### 2.3 空值处理
```{r, eval = F, echo = T}
# Drop the rows whose LF column is missing (NA).
fpkm %>% filter(!is.na(LF))
# Syntax notes:
# 1. %>% is the pipe operator: feed the result of one step into the next;
# 2. is.na() tests for missing values, so !is.na(LF) means "LF is not NA";
# 3. filter() keeps the rows that satisfy the condition and drops the rest;
# 4. piping does not modify the original fpkm object;
# 5. fpkm_filter <- fpkm %>% filter(!is.na(LF)) stores the result in a new
#    variable; R convention prefers "<-" over "=" for assignment.
# Replace missing values with 0 instead of dropping them.
fpkm[is.na(fpkm)] <- 0 # every NA in fpkm has now been replaced by 0
fpkm
```
#### 2.4 画柱状图
```{r, eval = T, echo = T, fig.align = 'center', fig.height = 6, fig.width = 10, out.width="50%", fig.showtext = T, cache=FALSE, fig.cap="Fig. GR柱状图", warning=FALSE}
#library(ggplot2)
fpkm[is.na(fpkm)] <- 0 # replace NA with 0
# Reshape the four sample columns into long format (one Group/FPKM pair per
# row), build a Species_Group label, and keep only the GR genes.
dat <- fpkm %>%
  pivot_longer(c(AM, AF, LM, LF), names_to = "Group", values_to = "FPKM") %>%
  mutate(Name = paste0(Species, "_", Group)) %>%
  filter(str_detect(Gene, "GR"))
# Syntax notes:
# 1. %>% pipes each step into the next;
# 2. pivot_longer() stacks the AM/AF/LM/LF columns into Group + FPKM
#    (it replaces the superseded gather());
# 3. mutate() adds a new column; paste0() concatenates the two labels;
# 4. filter() keeps only the matching rows.
# Stacked bar chart of log-scaled FPKM per gene;
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(data = dat, mapping = aes(x = Gene, y = log10(FPKM + 1), fill = Name)) +
  geom_col() +
  labs(x = "") +
  theme(axis.text.x = element_text(size = 10, angle = -45, hjust = -0.001, vjust = 1))
```
#### 2.5 画OBP热图
```{r, eval = T, echo = T, fig.align = 'center', fig.height = 6, fig.width = 4, out.width="40%", fig.showtext = T, cache=FALSE, fig.cap="Fig. OBP热图", warning=FALSE}
library(pheatmap)
# Build an expression matrix for the OBP genes:
# one row per Species+Gene label, four sample columns (AM, AF, LM, LF).
# Syntax notes:
# 1. %>% pipes each step into the next;
# 2. mutate() adds the combined Gene_name column (paste0() concatenates);
# 3. filter(str_detect(Gene, "OBP")) keeps rows whose Gene matches "OBP";
# 4. select() keeps only the listed columns.
# The result is named `obp_mat` rather than `union`, which masked base::union().
obp_mat <- fpkm %>%
  mutate(Gene_name = paste0(Species, Gene)) %>%
  filter(str_detect(Gene, "OBP")) %>%
  select(Gene_name, AM, AF, LM, LF)
obp_mat <- as.data.frame(obp_mat) # pheatmap needs a data.frame with row names
rownames(obp_mat) <- obp_mat[, 1] # first column becomes the row names
obp_mat <- obp_mat[, -1]          # keep only the 4 numeric columns
obp_mat <- log10(obp_mat + 1)     # log scale: FPKM spans orders of magnitude
# Draw the heatmap (TRUE/FALSE spelled out instead of T/F).
pheatmap(obp_mat,
         color = colorRampPalette(rev(c("red", "linen")))(10),
         legend = TRUE, show_rownames = TRUE,
         fontsize_row = 4, cellwidth = 55,
         cluster_rows = FALSE, cluster_cols = FALSE, border_color = NA)
```
#### 2.6 物种BmorOBP热图
```{r, eval = T, echo = T, fig.align = 'center', fig.height = 6, fig.width = 4, out.width="40%", fig.showtext = T, cache=FALSE, fig.cap="Fig. BmorOBP热图", warning=FALSE}
library(pheatmap)
fpkm[is.na(fpkm)] <- 0 # replace NA with 0
# Same as the previous heatmap, but restricted to one species:
# filter() now requires both conditions (an OBP gene AND Species == "Bmor").
# Named `bmor_obp` rather than `union` to avoid masking base::union().
bmor_obp <- fpkm %>%
  mutate(Gene_name = paste0(Species, Gene)) %>%
  filter(str_detect(Gene, "OBP") & Species == "Bmor") %>%
  select(Gene_name, AM, AF, LM, LF)
bmor_obp <- as.data.frame(bmor_obp) # pheatmap needs a data.frame with row names
rownames(bmor_obp) <- bmor_obp[, 1] # first column becomes the row names
bmor_obp <- bmor_obp[, -1]          # keep only the 4 numeric columns
bmor_obp <- log10(bmor_obp + 1)     # log scale: FPKM spans orders of magnitude
# Draw the heatmap (TRUE/FALSE spelled out instead of T/F).
pheatmap(bmor_obp,
         color = colorRampPalette(rev(c("red", "linen")))(10),
         legend = TRUE, show_rownames = TRUE,
         fontsize_row = 4, cellwidth = 55,
         cluster_rows = FALSE, cluster_cols = FALSE, border_color = NA)
```
### 3. tibble
#### 3.1 了解tibble数据格式
```{r, eval = T, echo = T}
# data.frame is R's traditional tabular structure; the tidyverse framework
# introduces tibble as its modern replacement.
# Inspect mpg, a dataset bundled with the tidyverse.
mpg
```
#### 3.2 ggplot画个图
* [ggplot画图](https://www.jianshu.com/p/9c1065904d53)
* [ggplot分面画图](https://blog.csdn.net/u014801157/article/details/24372507)
```{r, eval = T, echo = T, fig.align = 'center', fig.height = 6, fig.width = 10, out.width="60%", fig.showtext = T, cache=FALSE, fig.cap="", warning=FALSE}
# Scatter plot of engine displacement vs highway mileage, coloured by class.
ggplot(mpg, aes(displ, hwy, colour = class)) +
  geom_point()
# Same scatter plot, split into small multiples with facet_wrap().
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~ class, nrow = 2)
# Bar chart of diamond counts per cut quality.
ggplot(diamonds, aes(cut, fill = cut)) +
  geom_bar()
# Box plot of highway mileage per vehicle class.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot()
# The same box plot with the x and y axes swapped.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot() +
  coord_flip()
```
#### 3.3 tibble基本操作
* 过滤 filter()
```{r, eval = T, echo = T}
# The flights dataset ships with the nycflights13 package.
library(nycflights13)
flights
# filter(): keep only the flights that departed on January 1st.
flights %>% filter(month == 1 & day == 1)
# Several accepted values for a single variable via %in%.
flights %>% filter(month %in% c(11, 12))
# Much more in the book "R for data science"!
```
* 数据选择 select()
```{r, eval = T, echo = T}
flights_select <- flights %>% select(year:day, ends_with("delay"), distance, air_time)
# From flights keep: the columns from year through day, every column whose
# name ends in "delay", plus distance and air_time.
# The selection is stored in the new variable flights_select.
flights_select
```
* 增加新变量 mutate()
```{r, eval = T, echo = T}
# mutate() adds new columns while keeping all the existing ones.
flights_select %>% mutate(
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60)
# transmute() keeps only the newly created columns.
flights_select %>% transmute(
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60)
```
* 统计 summarize(), 和分组 group_by()联用
```{r, eval = T, echo = T}
# summarize() collapses each group to a single row; group_by() defines the
# groups. Here: mean departure delay per day (na.rm = TRUE ignores NA delays).
flights %>% group_by( year, month, day) %>%
  summarize(mean = mean(dep_delay, na.rm = TRUE))
# Much more in the book "R for data science"!
```
第二章
===================================
Inputs {.sidebar}
-------------------------------------
**第二章 Linux基础**
* 掌握基础linux命令
* sed/awk/vim工具进阶
* 编写shell脚本
Column {.tabset}
-------------------------------------
### 1. linux基础
#### 1.0 网络资源
* [Linux入门学习](https://blog.csdn.net/zhanglijingCSDN/article/details/82494312)
* [Linux新手入门教程](https://blog.csdn.net/zhanglijingCSDN/article/details/82494312)
#### 1.1 必掌握基础
```{r}
command_list <- "
Command; Notes; 用法
cd ;切换工作路径 ; cd 目录名称
mkdir ;新建工作目录 ; mkdir [参数] 目录名称
less ;查看文件 ; less file
head ;显示前10行 ; head [参数] file
tail ;显示末尾10行 ; tail [参数] file
wc ;统计 ; wc [参数] file
grep ;提取 ; grep [参数] file
pwd ;当前工作目录 ; pwd
top ;动态监视进程 ; top
kill ;杀死进程 ; kill[参数] [进程PID]
wget ;下载 ; wget [参数] 下载地址
... ;... ;...
"
# Parse the semicolon-separated command list into a data.frame.
command_dat <- read.delim(text = command_list, stringsAsFactors = FALSE,
                          header = TRUE, sep = ";", check.names = FALSE)
# Render as a table. The original call passed booktabs = TRUE twice; the
# duplicate is removed, and TRUE is spelled out instead of T.
knitr::kable(command_dat, booktabs = TRUE, longtable = FALSE, escape = TRUE,
             align = "c", linesep = "\\addlinespace") %>%
  kable_styling(full_width = TRUE, latex_options = c("hold_position")) %>%
  add_header_above(c("表. Linux必掌握基础命令" = 3), bold = TRUE)
```
### 2. linux进阶
#### 2.1 sed
```{bash, eval = T, echo = T}
# Count the number of lines in the FPKM table.
wc -l /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv
```
```{bash, eval = T, echo = T}
# Replace "OBP" with "obp" (sed substitutes the first match on each line).
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv | grep 'OBP'| \
sed 's/OBP/obp/'| head
# Commands used here:
# 1. less pages through a file
# 2. | is the shell pipe operator — used constantly on linux
# 3. grep keeps only the lines matching the pattern
# 4. sed performs the substitution
# 5. head shows only the first 10 lines; tail shows the last 10
```
#### 2.2 awk
```{bash, eval = T, echo = T}
# Prefix each line with a running number and print columns 1, 2 and the last two.
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv | \
awk -F ',' 'BEGIN{n=1}{print n"\t"$1"\t"$2"\t"$(NF-1)"\t"$NF; n++}' | \
head
# Notes:
# 1. -F ',' sets the field separator to a comma. The original used -F '\,',
#    but '\,' is an undefined escape sequence that awk implementations warn
#    about (and the accompanying note already said -F ','), so the plain
#    comma is used.
# 2. $1 is the first field, $NF the last field, $(NF-1) the second to last.
# 3. head prints only the first 10 lines to avoid flooding the terminal.
```
```{bash, eval = T, echo = T}
# Running-sum example:
# total the LM FPKM values (column 3 of the csv) over all OBP genes.
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fpkm.csv | \
grep 'OBP' | \
awk -F ',' '{sum=sum+$3}END{print "OBP_LM FPKM\t"sum}'
```
```{bash, eval = T, echo = T}
# fastq -> fasta: fastq records are 4 lines each. Line 1 of each record
# (NR%4 == 1) is printed as ">" plus the line minus its first character
# (substr($0, 2)); line 2 (NR%4 == 2), the sequence, is printed unchanged.
less /Users/liang/Desktop/github/Bioinformatics_learn/data/test.fastq | \
awk '{if(NR%4 == 1){print ">" substr($0, 2)}}{if(NR%4 == 2){print}}' | \
head -4
```
#### 2.3 vim
vim是linux编辑器,需要自行上网查询,使用频率非常高,掌握后对于编辑文件非常方便;
另 微软VScode 可实现remote-ssh登陆服务器,可实现服务端文件编辑,非常方便,极力推荐!!!
#### 2.4 shell/bash
以上运行linux命令均可写为shell bash脚本,直接运行bash *.sh即可,方便运行,还可整合为流程脚本,非常方便!!!
```{bash, eval = T, echo = T}
# fastq2fasta again: show the bash script that wraps the command above.
less /Users/liang/Desktop/github/Bioinformatics_learn/data/fastq2fasta.sh
```
```{bash, eval = T, echo = T}
# fastq2fasta again: run the script and show the first two records.
bash /Users/liang/Desktop/github/Bioinformatics_learn/data/fastq2fasta.sh | head -4
```
实现脚本传参
```{bash, eval = T, echo = T}
# A pipeline script usually needs arguments:
# pass the fastq file to the script as a positional parameter.
bash /Users/liang/Desktop/github/Bioinformatics_learn/data/fastq2fasta_argv.sh \
/Users/liang/Desktop/github/Bioinformatics_learn/data/test.fastq | \
head -4
```
第三章
===================================
Inputs {.sidebar}
-------------------------------------
**第三章 生信常用软件及格式**
* 了解常用生信软件
* 了解常用数据格式
* NCBI资源利用
Column {.tabset}
-------------------------------------
### 生信常用软件
```{r eval=TRUE, echo=FALSE, results='markup', include=TRUE, cache=FALSE, message=FALSE, warning=FALSE, error=TRUE, fig.cap=NULL, out.width='100%'}
text <- "
类别 | 软件 | 软件说明 | 备注
质控 | fastqc| 下机数据质控 | [github](https://github.com/s-andrews/FastQC) / [官网](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) / [软件安装及使用](https://www.cnblogs.com/lmt921108/p/6900416.html)
质控 | Trimmomatic| 下机数据质控| [github](https://github.com/timflutre/trimmomatic) / [软件安装及使用](http://www.usadellab.org/cms/index.php?page=trimmomatic) / [paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
质控 | SolexaQA| 下机数据质控 | [软件安装及使用](http://blog.sina.com.cn/s/blog_7f1542270101gccz.html) / [官网](http://solexaqa.sourceforge.net/) [paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-485)
SRA解压 | fastq-dump | 解压NCBI SRA格式为fastq格式| [软件安装及使用](https://blog.csdn.net/j_fun/article/details/60745262)
比对 | blast | NCBI 比对软件 | [软件安装及使用](https://blog.csdn.net/flyfrommath/article/details/52945586) / [算法](https://blog.csdn.net/huangliangbo0805/article/details/40868269)
比对 | bwa | 短序列比对软件 | [软件安装及使用](https://www.cnblogs.com/emanlee/p/4316573.html) / [算法](https://www.cnblogs.com/leezx/p/6226717.html)
比对 | bowtie2 | 短序列比对软件| [软件安装及使用](http://blog.sciencenet.cn/blog-830496-750216.html) / [算法](http://blog.sina.com.cn/s/blog_751bd9440102v2yj.html)
比对 | kallisto | 基于kmer比对软件, 速度极快 | [软件安装及使用](http://blog.sina.com.cn/s/blog_1704ff73a0102wzkl.html) / [paper](https://www.nature.com/articles/nbt.3519)
比对 | STAR | RNA-seq序列比对工具| [软件安装及使用](https://www.jianshu.com/p/fa388b21d1de) / [github](https://github.com/alexdobin/STAR)
sam处理 | samtools | 操作sam和bam文件的工具合集 | [软件安装及使用](https://www.jianshu.com/p/6b7a442d293f) / [github](https://github.com/samtools/samtools)
组装 | SOAPdenovo| 华大出品组装软件 | [软件使用](https://www.cnblogs.com/Formulate0303/p/6879841.html)
组装 | SPAdes |基因组组装软件 | [软件安装及使用](https://www.plob.org/article/7861.html)
组装 | IDBA | 基因组组装软件 |[软件安装及使用](http://118.31.76.100:100/ngs/bin/idba-ud/)
转录组组装 | Trinity | 转录组无参组装 | [软件安装及使用](https://www.jianshu.com/p/8518a23255f8)
有参转录组 | TopHat | 有参转录组比对软件 | [软件安装及使用](https://blog.csdn.net/g863402758/article/details/78454818)
有参转录组 | Cufflinks | 用于基因表达量的计算和差异表达基因的寻找 | [软件安装及使用](https://blog.csdn.net/g863402758/article/details/52965752)
聚类 | | |
构树 | | |
"
# Parse the pipe-separated software list into a data.frame.
df <- read.delim(text = text, stringsAsFactors = FALSE,
                 header = TRUE, sep = "|", check.names = FALSE)
# Render as a table. The original call passed booktabs = TRUE twice; the
# duplicate is removed, and TRUE is spelled out instead of T.
knitr::kable(df, booktabs = TRUE, longtable = FALSE, escape = TRUE,
             align = "c", linesep = "\\addlinespace") %>%
  kable_styling(full_width = TRUE, latex_options = c("hold_position")) %>%
  add_header_above(c("常用生信软件集合" = 4), bold = TRUE)
```
### 生信常用数据格式
```{r eval=TRUE, echo=FALSE, results='markup', include=TRUE, cache=FALSE, message=FALSE, warning=FALSE, error=TRUE, fig.cap=NULL, out.width='100%'}
text <- "
数据格式 | 格式介绍 | 备注
[fastq](https://blog.csdn.net/ltbylc/article/details/24346231) | 下机数据存储格式 | 下机原始数据,多以fastq/fq或压缩格式为后缀
[fasta](https://blog.csdn.net/ltbylc/article/details/24346231) | 核苷酸序列或氨基酸序列格式 | 序列存储格式,多以fasta/fna/fa为后缀
[NCBI SRA数据库](https://blog.csdn.net/u012150360/article/details/70556186) | NCBI 存储reads格式| wget下载后使用fastq-dump解压成fastq格式
[blast m8](https://shengxin.ren/article/46) | blast比对输出格式 | blast+对应的参数是-outfmt 6
[gff](https://www.cnblogs.com/djx571/p/9497707.html) | 基因组注释信息格式 | gff/gtf是贮存注释信息的文件格式
[sam](https://blog.csdn.net/u012150360/article/details/70556186) | 比对结果格式 | bwa / bowtie输出格式
[vcf](https://blog.csdn.net/u012150360/article/details/70556186) | 突变信息的文本格式 | 使用bcftools软件处理
"
# Parse the pipe-separated file-format list into a data.frame.
df <- read.delim(text = text, stringsAsFactors = FALSE,
                 header = TRUE, sep = "|", check.names = FALSE)
# Render as a table. The original call passed booktabs = TRUE twice; the
# duplicate is removed, and TRUE is spelled out instead of T.
knitr::kable(df, booktabs = TRUE, longtable = FALSE, escape = TRUE,
             align = "c", linesep = "\\addlinespace") %>%
  kable_styling(full_width = TRUE, latex_options = c("hold_position")) %>%
  add_header_above(c("常用生信数据格式集合" = 3), bold = TRUE)
```
About
====================================
Inputs {.sidebar}
-------------------------------------
Column
-------------------------------------
### **欢迎入坑**
* 生信命令不需要背,但需掌握常用的一些命令;
* 遇到问题一定要百度、谷歌、必应搜索,你遇到的问题或许别人已经给出解决方案;
* 学以致用效率最高,建议从 [**R for data science**](http://www.allitebooks.org/r-for-data-science-2/) 入手,掌握一些基础数据分析、画图能力;
* 循序渐进,逐步入门,共同进步~
### **说明**
* 生信入门三年,学点儿皮毛,整理不完善之处请谅解;
* 简单整理这几年入门趟的坑儿,希望能够入门效率高些;
* 整理基础知识,也是对自己知识的沉淀,得空余时间整理,速度偏慢;
* ...
### **生信履历**
```
2014.08 - 2015.12 : 植保所昆功组 - 转录组 - 生信入门 + 自学;
2015.12 - 2017.05 : 诺禾致源 - 基因组 - perl/R/bash;
2017.06 - 2018.12 : 量化健康 - 宏基因组 - perl/R/bash/python/docker/git/ML;
2018.12 - 至今 : 先声诊断 - Nanopore - 数据库/机器学习 + 成长中~;
```